This project aims to present a brief analysis of the booking patterns and sales status to assist the management in determining the different types of guests they receive and to assess the status of such bookings which in turn will be utilised to make a predictive model on whether a booking will be canceled or not based on the different variables available.
The dataset contains 32 different variables which were analysed and detailed insights were generated which the management can utilise to plan and promote their hotels to prospective customers.
library(readxl)
library(skimr)
library(DT)
library(plotly)
library(tidyverse)
library(caTools)
library(caret)
data_dictionary <- read_xlsx("Hospitality_Analysis.xlsx")
datatable(
head(data_dictionary,40),
extensions = 'FixedColumns',
options = list(
scrollY = "400px",
scrollX = TRUE,
fixedColumns = TRUE
)
)
df <- read_xlsx("Hospitality_Analysis.xlsx",sheet=2)
data.frame(head(df))
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 Resort Hotel 0 342 2015 July
## 2 Resort Hotel 0 737 2015 July
## 3 Resort Hotel 0 7 2015 July
## 4 Resort Hotel 0 13 2015 July
## 5 Resort Hotel 0 14 2015 July
## 6 Resort Hotel 0 14 2015 July
## arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1 27 1 0
## 2 27 1 0
## 3 27 1 0
## 4 27 1 0
## 5 27 1 0
## 6 27 1 0
## stays_in_week_nights adults children babies meal country market_segment
## 1 0 2 0 0 BB PRT Direct
## 2 0 2 0 0 BB PRT Direct
## 3 1 1 0 0 BB GBR Direct
## 4 1 1 0 0 BB GBR Corporate
## 5 2 2 0 0 BB GBR Online TA
## 6 2 2 0 0 BB GBR Online TA
## distribution_channel is_repeated_guest previous_cancellations
## 1 Direct 0 0
## 2 Direct 0 0
## 3 Direct 0 0
## 4 Corporate 0 0
## 5 TA/TO 0 0
## 6 TA/TO 0 0
## previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1 0 C C
## 2 0 C C
## 3 0 A C
## 4 0 A A
## 5 0 A A
## 6 0 A A
## booking_changes deposit_type agent company days_in_waiting_list customer_type
## 1 3 No Deposit NULL NULL 0 Transient
## 2 4 No Deposit NULL NULL 0 Transient
## 3 0 No Deposit NULL NULL 0 Transient
## 4 0 No Deposit 304 NULL 0 Transient
## 5 0 No Deposit 240 NULL 0 Transient
## 6 0 No Deposit 240 NULL 0 Transient
## adr required_car_parking_spaces total_of_special_requests reservation_status
## 1 0 0 0 Check-Out
## 2 0 0 0 Check-Out
## 3 75 0 0 Check-Out
## 4 75 0 0 Check-Out
## 5 98 0 1 Check-Out
## 6 98 0 1 Check-Out
## reservation_status_date
## 1 2015-07-01
## 2 2015-07-01
## 3 2015-07-02
## 4 2015-07-02
## 5 2015-07-03
## 6 2015-07-03
str(df)
## tibble [119,390 × 32] (S3: tbl_df/tbl/data.frame)
## $ hotel : chr [1:119390] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : num [1:119390] 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : num [1:119390] 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : num [1:119390] 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : chr [1:119390] "July" "July" "July" "July" ...
## $ arrival_date_week_number : num [1:119390] 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : num [1:119390] 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : num [1:119390] 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : num [1:119390] 2 2 1 1 2 2 2 2 2 2 ...
## $ children : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : chr [1:119390] "BB" "BB" "BB" "BB" ...
## $ country : chr [1:119390] "PRT" "PRT" "GBR" "GBR" ...
## $ market_segment : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
## $ distribution_channel : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
## $ is_repeated_guest : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : chr [1:119390] "C" "C" "A" "A" ...
## $ assigned_room_type : chr [1:119390] "C" "C" "C" "A" ...
## $ booking_changes : num [1:119390] 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : chr [1:119390] "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
## $ agent : chr [1:119390] "NULL" "NULL" "NULL" "304" ...
## $ company : chr [1:119390] "NULL" "NULL" "NULL" "NULL" ...
## $ days_in_waiting_list : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : chr [1:119390] "Transient" "Transient" "Transient" "Transient" ...
## $ adr : num [1:119390] 0 0 75 75 98 ...
## $ required_car_parking_spaces : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : num [1:119390] 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : chr [1:119390] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
## $ reservation_status_date : POSIXct[1:119390], format: "2015-07-01" "2015-07-01" ...
skim_without_charts(df)
| Name | df |
| Number of rows | 119390 |
| Number of columns | 32 |
| _______________________ | |
| Column type frequency: | |
| character | 13 |
| numeric | 18 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| hotel | 0 | 1 | 10 | 12 | 0 | 2 | 0 |
| arrival_date_month | 0 | 1 | 3 | 9 | 0 | 12 | 0 |
| meal | 0 | 1 | 2 | 9 | 0 | 5 | 0 |
| country | 0 | 1 | 2 | 4 | 0 | 178 | 0 |
| market_segment | 0 | 1 | 6 | 13 | 0 | 8 | 0 |
| distribution_channel | 0 | 1 | 3 | 9 | 0 | 5 | 0 |
| reserved_room_type | 0 | 1 | 1 | 1 | 0 | 10 | 0 |
| assigned_room_type | 0 | 1 | 1 | 1 | 0 | 12 | 0 |
| deposit_type | 0 | 1 | 10 | 10 | 0 | 3 | 0 |
| agent | 0 | 1 | 1 | 4 | 0 | 334 | 0 |
| company | 0 | 1 | 1 | 4 | 0 | 353 | 0 |
| customer_type | 0 | 1 | 5 | 15 | 0 | 4 | 0 |
| reservation_status | 0 | 1 | 7 | 9 | 0 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| is_canceled | 0 | 1 | 0.37 | 0.48 | 0.00 | 0.00 | 0.00 | 1 | 1 |
| lead_time | 0 | 1 | 104.01 | 106.86 | 0.00 | 18.00 | 69.00 | 160 | 737 |
| arrival_date_year | 0 | 1 | 2016.16 | 0.71 | 2015.00 | 2016.00 | 2016.00 | 2017 | 2017 |
| arrival_date_week_number | 0 | 1 | 27.17 | 13.61 | 1.00 | 16.00 | 28.00 | 38 | 53 |
| arrival_date_day_of_month | 0 | 1 | 15.80 | 8.78 | 1.00 | 8.00 | 16.00 | 23 | 31 |
| stays_in_weekend_nights | 0 | 1 | 0.93 | 1.00 | 0.00 | 0.00 | 1.00 | 2 | 19 |
| stays_in_week_nights | 0 | 1 | 2.50 | 1.91 | 0.00 | 1.00 | 2.00 | 3 | 50 |
| adults | 0 | 1 | 1.86 | 0.58 | 0.00 | 2.00 | 2.00 | 2 | 55 |
| children | 4 | 1 | 0.10 | 0.40 | 0.00 | 0.00 | 0.00 | 0 | 10 |
| babies | 0 | 1 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0 | 10 |
| is_repeated_guest | 0 | 1 | 0.03 | 0.18 | 0.00 | 0.00 | 0.00 | 0 | 1 |
| previous_cancellations | 0 | 1 | 0.09 | 0.84 | 0.00 | 0.00 | 0.00 | 0 | 26 |
| previous_bookings_not_canceled | 0 | 1 | 0.14 | 1.50 | 0.00 | 0.00 | 0.00 | 0 | 72 |
| booking_changes | 0 | 1 | 0.22 | 0.65 | 0.00 | 0.00 | 0.00 | 0 | 21 |
| days_in_waiting_list | 0 | 1 | 2.32 | 17.59 | 0.00 | 0.00 | 0.00 | 0 | 391 |
| adr | 0 | 1 | 101.83 | 50.54 | -6.38 | 69.29 | 94.58 | 126 | 5400 |
| required_car_parking_spaces | 0 | 1 | 0.06 | 0.25 | 0.00 | 0.00 | 0.00 | 0 | 8 |
| total_of_special_requests | 0 | 1 | 0.57 | 0.79 | 0.00 | 0.00 | 0.00 | 1 | 5 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| reservation_status_date | 0 | 1 | 2014-10-17 | 2017-09-14 | 2016-08-07 | 926 |
Only children column has NA values. Since this is just 4 rows we can change the values to 0.
df$children[is.na(df$children)] <- 0
any(is.na(df))
## [1] FALSE
df %>% count(hotel)
## # A tibble: 2 × 2
## hotel n
## <chr> <int>
## 1 City Hotel 79330
## 2 Resort Hotel 40060
Canceled bookings
round(prop.table(table(df$is_canceled)),2)*100
##
## 0 1
## 63 37
63% of bookings get materialised while 37% of all bookings get canceled.
ggplot(df,aes(is_canceled,fill=factor(hotel))) +
geom_histogram(binwidth = 0.2) +
scale_x_continuous(breaks = seq(0, 1, 1)) + theme_bw() +
scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
labs(title = "Distribution of bookings by hotel and cancellation")
plot_ly(df,x=~adr,y=~hotel,type="box", color=~hotel) %>%
layout(title = "Distribution of ADR")
nrow(df[df$adr>5000,])
## [1] 1
Only 1 row has extremely high value of ADR. This can be replaced with the mean of ADR.
df[df$adr>5000,]$adr <- mean(df$adr)
plot_ly(df,x=~adr,y=~hotel,type="box", color=~hotel) %>%
layout(title = "Distribution of ADR")
Median ADR falls into a range of around 80 to 100$. People staying in city hotels are paying a higher ADR compared to resort hotels.
ggplot(df,aes(factor(arrival_date_year), fill=hotel)) + geom_bar(width=0.25) + theme_bw()+
scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
labs(title = "Bookings received per year")
year_df <- df %>% group_by(arrival_date_year = factor(arrival_date_year)) %>%
summarise(adr = mean(adr))
ggplot(year_df,aes(arrival_date_year,adr)) + geom_line(aes(group=1)) + theme_bw() +
labs(title = "Mean ADR by year")
month <-as.factor(df$arrival_date_month)
df$month <- factor(month, levels=c("January","February","March","April","May","June","July","August","September","October","November","December"))
ggplot(df,aes(x=month, fill=hotel)) + geom_bar() + theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
labs(title = "Bookings by month")
ggplot(df,aes(x=month, y=adr, fill=hotel)) + geom_col(position="dodge") + theme_bw() +
scale_x_discrete(guide = guide_axis(angle = 45)) +
scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
labs(title = "ADR by month")
Bookings received are highest in month of July and August and similarly ADR is higher for those months for the resort hotel. However City hotels have a higher ADR in the months of May and December.
ggplot(df,aes(arrival_date_day_of_month, fill=hotel)) + geom_bar() + theme_bw()+
scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
labs(title = "Bookings received by date of month")
No discernible pattern seems to be there for bookings by date of month.
fig <- plot_ly(df,x=~lead_time,y=~factor(is_canceled),type="box", color=~factor(is_canceled))
fig %>% layout(title = "Booking cancellations by lead time",yaxis=list(title="Cancelled"))
weekend_stay <- df$stays_in_weekend_nights
weekday_stay <- df$stays_in_week_nights
df$length_of_stay <- weekend_stay + weekday_stay
ggplot(df[df$length_of_stay<=15,],aes(x=factor(length_of_stay),fill=hotel)) +
geom_bar(position="dodge") +
scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
labs(title = "Average Length of stay") + theme_bw()
Length of stay for majority of the bookings average between 1-4 days. Duration of 5 days and under have higher frequency for city hotel bookings. However for longer duration stays, resort hotels are more frequently chosen.
df$child <- ifelse(df$children>1 | df$babies>1,"Yes","No")
plot_ly(df,x=~adr,y=~child,type="box", color=~child) %>%
layout(title = "ADR by children present or not")
ggplot(df,aes(market_segment,fill=factor(is_canceled))) + geom_bar() +
facet_wrap(~hotel,ncol=1) + theme_bw() + theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) + scale_fill_manual(values=c("#8da0cb", "#66c2a4")) +
labs(title = "Bookings distribution by Market Segment")
ggplot(df,aes(reservation_status,fill=deposit_type)) + geom_bar(width=0.25) + theme_bw() +
theme_bw() + labs(title = "Reservation status of bookings") +
scale_fill_manual(values=c("#8da0cb", "#66c2a4","#FF0000"))
Splitting train and test set
df <- df %>% mutate(arrival_date_month=as.factor(arrival_date_month))
set.seed(42)
sample <- sample.split(df$is_canceled,SplitRatio=0.8)
train <- subset(df,sample==T)
test <- subset(df,sample==F)
Since we are trying to predict whether a booking will get canceled, this is a binary categorisation. Here I am using logistic regression to train the model.
model <- glm(is_canceled~lead_time + arrival_date_year +
arrival_date_month + arrival_date_week_number +
previous_cancellations + adr + deposit_type + customer_type + length_of_stay +
child, family=binomial(logit),train,na.action = na.exclude)
summary(model)
##
## Call:
## glm(formula = is_canceled ~ lead_time + arrival_date_year + arrival_date_month +
## arrival_date_week_number + previous_cancellations + adr +
## deposit_type + customer_type + length_of_stay + child, family = binomial(logit),
## data = train, na.action = na.exclude)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 29.4726193 31.3879049 0.939 0.347741
## lead_time 0.0043683 0.0001002 43.586 < 2e-16 ***
## arrival_date_year -0.0157016 0.0155624 -1.009 0.313003
## arrival_date_monthAugust 0.1245163 0.1095711 1.136 0.255790
## arrival_date_monthDecember 0.7913833 0.2142867 3.693 0.000222 ***
## arrival_date_monthFebruary -0.1262907 0.0656223 -1.925 0.054291 .
## arrival_date_monthJanuary -0.4253279 0.0907765 -4.685 2.79e-06 ***
## arrival_date_monthJuly 0.0194341 0.0850917 0.228 0.819343
## arrival_date_monthJune -0.0629679 0.0636098 -0.990 0.322219
## arrival_date_monthMarch -0.2379047 0.0468777 -5.075 3.87e-07 ***
## arrival_date_monthMay -0.0421654 0.0442397 -0.953 0.340533
## arrival_date_monthNovember 0.5401268 0.1883691 2.867 0.004139 **
## arrival_date_monthOctober 0.4450741 0.1614365 2.757 0.005834 **
## arrival_date_monthSeptember 0.1385236 0.1367823 1.013 0.311189
## arrival_date_week_number -0.0232603 0.0060068 -3.872 0.000108 ***
## previous_cancellations 1.4014360 0.0444249 31.546 < 2e-16 ***
## adr 0.0051710 0.0002030 25.473 < 2e-16 ***
## deposit_typeNon Refund 5.4840645 0.1198055 45.775 < 2e-16 ***
## deposit_typeRefundable -0.0488824 0.2318366 -0.211 0.833005
## customer_typeGroup -0.3205979 0.1629326 -1.968 0.049106 *
## customer_typeTransient 0.8533876 0.0516618 16.519 < 2e-16 ***
## customer_typeTransient-Party 0.1631544 0.0537708 3.034 0.002411 **
## length_of_stay 0.0287689 0.0030689 9.374 < 2e-16 ***
## childYes 0.1604933 0.0417433 3.845 0.000121 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 125918 on 95511 degrees of freedom
## Residual deviance: 94861 on 95488 degrees of freedom
## AIC: 94909
##
## Number of Fisher Scoring iterations: 7
test$status <- predict(model,test,type="response")
test$status2 <- ifelse(test$status>0.5,1,0)
confusionMatrix(as.factor(test$status2),as.factor(test$is_canceled))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 14614 5216
## 1 419 3629
##
## Accuracy : 0.764
## 95% CI : (0.7586, 0.7694)
## No Information Rate : 0.6296
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4305
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9721
## Specificity : 0.4103
## Pos Pred Value : 0.7370
## Neg Pred Value : 0.8965
## Prevalence : 0.6296
## Detection Rate : 0.6120
## Detection Prevalence : 0.8305
## Balanced Accuracy : 0.6912
##
## 'Positive' Class : 0
##